Import useful stuff and define ancillary functions


In [23]:
%pylab inline
%load_ext autoreload
%autoreload 2

from __future__ import division

from collections import defaultdict, namedtuple
import cPickle as pickle
from datetime import datetime, timedelta
import dateutil
from functools import partial
import inspect
import json
import os
import re
import sys

import numpy as np
import pandas as pd


Populating the interactive namespace from numpy and matplotlib
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
WARNING: pylab import has clobbered these variables: ['datetime']
`%matplotlib` prevents importing * from pylab and numpy

Load train data

Using pandas' read_csv with all the defaults


In [2]:
if os.name == 'nt':
    TRAIN_PATH = r'D:\train.csv'
    PTRAIN_PATH = r'D:\train_preprocessed_float_string_date.csv'
    TEST_PATH = r'D:\test.csv'
    GOOGNEWS_PATH = r'D:\GoogleNews-vectors-negative300.bin.gz'
    VOCAB_PATH = r'D:\big.txt'
else:
    TRAIN_PATH = r'/media/mtambos/speedy/train.csv'
    PTRAIN_PATH = r'/media/mtambos/speedy/train_preprocessed_float_string_date.csv'
    TEST_PATH = r'/media/mtambos/speedy/test.csv'
    GOOGNEWS_PATH = r'/media/mtambos/speedy/GoogleNews-vectors-negative300.bin.gz'
    VOCAB_PATH = r'/media/mtambos/speedy/big.txt'
#df_orig = pd.read_csv(TRAIN_PATH, index_col="ID")
df = pd.read_csv(PTRAIN_PATH, index_col="ID")
#df


/home/mtambos/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (8,9,10,11,12,23,136,172,190,193,194,196,200,203) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)
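
The DtypeWarning above comes from pandas inferring column dtypes chunk by chunk. A minimal sketch of the same load without chunked inference; it avoids the warning at the cost of reading the whole file into memory first (pinning the offending columns with the dtype argument, once their names are known, is the other option):

# Read the whole file before inferring dtypes; avoids the DtypeWarning.
# Alternatively, pass dtype={'VAR_XXXX': str} for the mixed columns once known.
df = pd.read_csv(PTRAIN_PATH, index_col="ID", low_memory=False)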

Define columns


In [3]:
date_cols = ['VAR_0073', 'VAR_0075', 'VAR_0156',
             'VAR_0157', 'VAR_0158', 'VAR_0159',
             'VAR_0166', 'VAR_0167', 'VAR_0168',
             'VAR_0169', 'VAR_0176', 'VAR_0177',
             'VAR_0178', 'VAR_0179', 'VAR_0204',
             'VAR_0217', 'VAR_0294', 'VAR_0314']

Parse the non-standard date format used in the date columns


In [6]:
def parse_date_str(date_val):
    """Parse values like '29jan14:18:45:00', '29jan14' or '2014'; return NaT otherwise."""
    # Values that are already datetimes pass through unchanged.
    if isinstance(date_val, datetime):
        return date_val
    date_val = str(date_val).lower()
    if date_val == 'nan':
        return pd.NaT
    # Some entries went through a float round-trip and carry a trailing '.0'.
    if date_val.endswith('.0'):
        date_val = date_val[:-2]
    # Try the formats from most to least specific.
    for fmt in ('%d%b%y:%H:%M:%S', '%d%b%y', '%Y'):
        try:
            return datetime.strptime(date_val, fmt)
        except ValueError:
            continue
    # Leave a trace of anything that still fails to parse.
    print date_val
    return pd.NaT

df_date_cols = df[date_cols].applymap(parse_date_str).astype('datetime64[ns]')
df_date_cols.describe()


Out[6]:
VAR_0073 VAR_0075 VAR_0156 VAR_0157 VAR_0158 VAR_0159 VAR_0166 VAR_0167 VAR_0168 VAR_0169 VAR_0176 VAR_0177 VAR_0178 VAR_0179 VAR_0204 VAR_0217 VAR_0294 VAR_0314
count 44104 145175 5870 920 2089 5870 14230 2567 10725 14230 17532 3358 12073 17532 145175 145175 93943 88770
unique 1458 2371 730 424 407 650 2145 853 1645 1908 2163 945 1648 1875 1192 397 17 17
top 2009-03-13 00:00:00 2010-09-22 00:00:00 2011-12-12 00:00:00 2012-07-27 00:00:00 2012-06-01 00:00:00 2012-06-04 00:00:00 2011-12-19 00:00:00 2011-11-03 00:00:00 2011-11-03 00:00:00 2011-12-19 00:00:00 2011-12-12 00:00:00 2011-11-03 00:00:00 2011-11-03 00:00:00 2012-06-04 00:00:00 2014-01-31 15:54:00 2011-12-06 02:00:00 2012-01-01 00:00:00 2012-01-01 00:00:00
freq 260 1168 45 8 28 49 45 27 206 48 67 28 200 83 242 893 51338 48483
first 2008-01-02 00:00:00 2001-01-01 00:00:00 2008-04-11 00:00:00 2008-10-17 00:00:00 2008-09-30 00:00:00 2008-09-23 00:00:00 2002-07-30 00:00:00 2005-02-12 00:00:00 1999-12-31 00:00:00 2002-07-30 00:00:00 2002-07-30 00:00:00 2005-02-12 00:00:00 1999-12-31 00:00:00 2002-07-30 00:00:00 2014-01-29 18:45:00 2011-10-02 02:00:00 1995-01-01 00:00:00 1995-01-01 00:00:00
last 2012-10-31 00:00:00 2012-11-01 00:00:00 2012-10-29 00:00:00 2012-10-31 00:00:00 2012-10-29 00:00:00 2012-10-29 00:00:00 2012-10-30 00:00:00 2012-10-30 00:00:00 2012-11-01 00:00:00 2012-11-01 00:00:00 2012-10-30 00:00:00 2012-10-31 00:00:00 2012-11-01 00:00:00 2012-11-01 00:00:00 2014-02-01 01:26:00 2012-11-01 02:00:00 2012-01-01 00:00:00 2012-01-01 00:00:00

In [11]:
df[date_cols] = df_date_cols
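
Assuming the raw strings follow the DDMONYY:HH:MM:SS pattern implied by the parsed values above, a vectorised alternative is pd.to_datetime with an explicit format and coercion of unparseable entries to NaT. This is a sketch only: it skips the bare-date and year-only fallbacks that parse_date_str handles, and errors='coerce' needs a reasonably recent pandas.

# Sketch: vectorised parsing of a single raw date column; anything that does not
# match the full 'DDMONYY:HH:MM:SS' format becomes NaT instead of raising.
# raw = pd.read_csv(TRAIN_PATH, index_col="ID", usecols=["ID", "VAR_0204"])
# parsed = pd.to_datetime(raw["VAR_0204"], format='%d%b%y:%H:%M:%S', errors='coerce')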

Drop duplicate rows


In [8]:
df.drop_duplicates(inplace=True)

Drop duplicate date columns


In [13]:
cols_to_drop = set()
# Pairwise-compare the date columns for exact equality. Note that NaT == NaT
# evaluates to False, so columns with missing dates are never flagged here.
for i, col in enumerate(date_cols):
    for col2 in date_cols[i+1:]:
        if (df[col] == df[col2]).all():
            cols_to_drop.add(col2)
cols_to_drop


Out[13]:
set()
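
No duplicates turn up among the date columns. The loop above only compares the date columns, though; a broader (and memory-hungry, given roughly 1,900 columns) sketch that flags exact duplicates anywhere in the frame:

# Sketch: flag every column that exactly duplicates an earlier column.
# Transposing a 145k-row frame is expensive, so this is illustrative only.
# dup_mask = df.T.duplicated()
# dup_cols = dup_mask.index[dup_mask].tolist()
# df = df.drop(dup_cols, axis=1)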

Remove columns with only NaNs


In [14]:
nan_cols = df.isnull().all()
nan_cols = nan_cols.index[nan_cols].tolist()
nan_cols


Out[14]:
[]
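
No all-NaN columns are found, so there is nothing to drop here. The same check-and-drop can be done in one step; the call below is equivalent to dropping the columns listed in nan_cols (a no-op in this case):

# Drop columns that contain only NaNs in a single call.
df = df.dropna(axis=1, how='all')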

Vectorize the datetime columns

Create dataframes to separately store the year, month and day information of the date columns


In [17]:
df[date_cols] = df[date_cols].astype('datetime64[ns]')
years = pd.DataFrame(columns=[c+'_year' for c in date_cols], index=df.index, dtype=np.int)
months = pd.DataFrame(columns=[c+'_month' for c in date_cols], index=df.index, dtype=np.int)
days = pd.DataFrame(columns=[c+'_day' for c in date_cols], index=df.index, dtype=np.int)

# Split each date column into numeric year, month and day columns via a DatetimeIndex.
for c in date_cols:
    dateIndex = pd.DatetimeIndex(df[c])
    years[c+'_year'] = dateIndex.year
    months[c+'_month'] = dateIndex.month
    days[c+'_day'] = dateIndex.day

Delete the original date columns and join the years, months and days DataFrames with the original DataFrame


In [18]:
df = df.drop(date_cols, axis=1)
df = df.join(years)
df = df.join(months)
df = df.join(days)

In [19]:
date_cols = years.columns.tolist() + months.columns.tolist() + days.columns.tolist()
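
A quick sanity-check sketch of the split: the 18 original date columns should be gone, each replaced by a _year, _month and _day column.

# Sketch: verify the date columns were replaced by their year/month/day counterparts.
assert len(date_cols) == 18 * 3
assert 'VAR_0073' not in df.columns
assert {'VAR_0073_year', 'VAR_0073_month', 'VAR_0073_day'} <= set(df.columns)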

Eliminate columns with 0 variance

See which columns have low standard deviation


In [21]:
# Summarise the numeric columns and order them by ascending standard deviation
# to surface (near-)constant columns.
df_desc = df.describe()
df_desc[sorted(df_desc.columns, key=lambda x: df_desc.loc['std', x])]


Out[21]:
VAR_0733 VAR_0736 VAR_0737 VAR_0739 VAR_0204_year VAR_0294_month VAR_0314_month VAR_0294_day VAR_0314_day VAR_0526 ... VAR_0330 VAR_0289 VAR_0315 VAR_0295 VAR_0293 VAR_0313 VAR_0340 VAR_0331 VAR_0333 VAR_0212
count 145231 145231 145231 145231 145175 93943 88770 93943 88770 145231.000000 ... 1.452310e+05 1.452310e+05 1.452310e+05 1.452310e+05 1.452310e+05 1.452310e+05 1.452310e+05 1.452310e+05 1.452310e+05 1.452310e+05
mean 0 0 0 0 2014 1 1 1 1 0.000007 ... 4.803098e+04 5.442426e+04 9.795514e+04 1.031416e+05 1.080350e+05 1.062522e+05 -1.161031e+04 9.404519e+04 8.855007e+04 5.168069e+10
std 0 0 0 0 0 0 0 0 0 0.002624 ... 9.788183e+05 1.118892e+06 1.221113e+06 1.291840e+06 1.418194e+06 1.453820e+06 1.512625e+06 1.551878e+06 1.567272e+06 2.482392e+10
min 0 0 0 0 2014 1 1 1 1 0.000000 ... -1.000000e+00 -1.000000e+00 -1.000000e+00 -1.000000e+00 -1.000000e+00 -1.000000e+00 -3.444084e+08 -1.000000e+00 -1.000000e+00 7.001162e+09
25% 0 0 0 0 2014 1 1 1 1 0.000000 ... -1.000000e+00 -1.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 -1.000000e+00 0.000000e+00 0.000000e+00 3.024070e+10
50% 0 0 0 0 2014 1 1 1 1 0.000000 ... -1.000000e+00 -1.000000e+00 0.000000e+00 0.000000e+00 1.060000e+04 6.910000e+03 -1.000000e+00 0.000000e+00 0.000000e+00 4.442527e+10
75% 0 0 0 0 2014 1 1 1 1 0.000000 ... 0.000000e+00 -1.000000e+00 8.330000e+04 8.852700e+04 7.880000e+04 7.276400e+04 -1.000000e+00 4.215500e+04 4.920000e+04 7.543025e+10
max 0 0 0 0 2014 1 1 1 1 1.000000 ... 1.910000e+08 1.910000e+08 2.415000e+08 2.415000e+08 2.415000e+08 2.415000e+08 1.006916e+08 3.444094e+08 3.444094e+08 9.940328e+10

8 rows × 1909 columns

Eliminate all columns with standard deviation equal to 0


In [27]:
# Columns with zero standard deviation are constant and carry no information.
std_series = df_desc.loc['std', :]
null_std_cols = std_series[std_series == 0]
df = df.drop(null_std_cols.index, axis=1)
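
As a small verification sketch, recomputing the summary on the reduced frame should leave no zero-variance columns behind.

# Sketch: no remaining numeric column should have a standard deviation of exactly 0.
remaining_std = df.describe().loc['std', :]
assert (remaining_std == 0).sum() == 0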

Save the preprocessed data back to the preprocessed CSV file (PTRAIN_PATH)


In [28]:
df.to_csv(PTRAIN_PATH)

In [24]:
# Persist the list of derived date column names for reuse in later notebooks.
with open('date_cols.pickle', 'wb') as fp:
    pickle.dump(date_cols, fp)
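
Downstream notebooks can restore the column list with a sketch like the following.

# Sketch: reload the persisted list of derived date column names.
with open('date_cols.pickle', 'rb') as fp:
    date_cols = pickle.load(fp)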